Abstract
Datasets gathered from platforms designed for casual conversation, such as Twitter, often contain a large amount of sarcasm. Because sarcasm is difficult to detect for both humans and Natural Language Processing (NLP) models, it can hinder a model’s accuracy. As a result, we aim to create an NLP model specifically designed to detect sarcasm on behalf of other NLP models. To do this, two datasets will be utilized: ISarcasm and Tweets with Sarcasm and Irony. In addition, these datasets will be trained on with five different models: Recurrent Neural Networks (RNN), Support Vector Machines (SVM), Random Forests, Decision Trees, and XGBoost. By combining our model with other NLP models, we hope to increase the accuracy of those models.
Dataset
# Load the "Tweets with Sarcasm and Irony" dataset. Each CSV has two
# character columns: `tweets` (raw tweet text) and `class` (label).
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))
#train$tweets <- map(train$tweets, .f = function(x){
# str_squish(x)
#}) %>% unlist()
# Basic dataset facts: the distinct class labels and the set sizes.
classes <- train$class %>% unique()
num_obs_train <- nrow(train)
num_obs_test <- nrow(test)
Classes: figurative, irony, regular, sarcasm
Number of observations in Train: 81408
Number of observations in Test: 8128
# Observation count per class.
t <- train %>% group_by(class) %>% count()
t
# Sanity check: look for empty, blank, or missing tweet texts.
train %>% filter(tweets == "" | tweets == " " | is.na(tweets))
# Approximate tweet length in words: number of spaces + 1.
tweet_lengths <- train$tweets %>% map(
.f = function(x){
str_count(x, pattern = " ") + 1
}) %>% unlist()
ids <- 1:nrow(train)
train_temp <- train %>% mutate(tweet_length = tweet_lengths,
id = ids)
# Distribution of tweet lengths as a bar chart colored by count.
train_temp %>% ggplot(aes(x = tweet_lengths)) +
geom_bar(aes(fill = after_stat(count)))
# Overall spread of tweet lengths as a single boxplot.
train_temp %>% ggplot(aes(y = tweet_lengths)) +
geom_boxplot()
*Tweet length range visualized with a boxplot for each class*
# Tweet length spread broken out by class.
train_temp %>% ggplot(aes(x = class, y = tweet_lengths)) +
geom_boxplot()
Max tweet length: 67
Min tweet length: 1
Mean tweet length: 15.1798595
# Inspect the longest tweet(s).
t <- train_temp %>% filter(tweet_lengths == max(tweet_lengths))
t
# Inspect the shortest tweet(s).
t <- train_temp %>% filter(tweet_lengths == min(tweet_lengths))
t
#------------------------------------------------------------------------------
# Progress reporter for long loops: prints "<pct>% Complete" whenever the
# completion percentage reaches a new multiple of 10.
#   i      - current iteration
#   length - total number of iterations in the loop
#   p      - percentage announced on the previous call (initialize p = 0
#            outside the loop and pass the return value back in each time,
#            so the same percentage is never printed twice)
# Returns the last percentage that was announced.
print_percent <- function(i, length, p) {
  pct <- floor(100 * i / length)
  if (pct %% 10 == 0 && pct != p) {
    print(paste0(pct, "% Complete"))
    return(pct)
  }
  p
}
#------------------------------------------------------------------------------
# Separates hashtags from text.
# text: character vector of tweets. Returns a list with one element per
# tweet, each a list of the space-delimited tokens that contain a "#".
# Progress is reported via print_percent() (defined above in this file).
get_hashtags_df <- function(text) {
  # Split each tweet on single spaces (same tokenization as before).
  tweets_separated <- strsplit(text, split = " ", fixed = TRUE)
  n <- length(tweets_separated)
  # Preallocate the result instead of growing it with append() in the
  # loop (the original was O(n^2) in the number of tweets).
  y <- vector("list", n)
  p <- 0
  for (i in seq_len(n)) {
    tokens <- tweets_separated[[i]]
    # A token counts as a hashtag if it contains "#" anywhere — the
    # original regex "#.*" matched any "#", not only a leading one.
    y[[i]] <- as.list(tokens[grepl("#", tokens, fixed = TRUE)])
    p <- print_percent(i, length = n, p = p)
  }
  y
}
# Re-load fresh copies of the raw train/test CSVs for the word-frequency
# analysis below.
Dir <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
train <- read_csv(paste0(Dir,"/train.csv"))
## Rows: 81408 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
test <- read_csv(paste0(Dir,"/test.csv"))
## Rows: 8128 Columns: 2
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): tweets, class
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#test <- test %>% filter(!is.na(class))
# Words whose raw frequency exceeds 851 in each class ("outliers"), plotted
# as descending bar charts with tilted x labels.
# NOTE(review): the freq_* frequency tables (columns Var1 = word, Freq =
# count) are built in a chunk not shown here; also, `stringsAsFactors` is
# not an argument of subset() and is silently swallowed by `...`.
outlierSubsetFigurative <- subset(freq_figurative, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetFigurative %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same high-frequency filter and chart for the irony class.
outlierSubsetIrony <- subset(freq_irony, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetIrony %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same for the sarcasm class.
outlierSubsetSarcasm <- subset(freq_sarcasm, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetSarcasm %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Same for the regular class.
outlierSubsetRegular <- subset(freq_regular, Freq > 851, stringsAsFactors = FALSE)
outlierSubsetRegular %>%
ggplot(aes(x = reorder(Var1, order(Freq, decreasing = TRUE)), y = Freq)) +
geom_bar(stat = 'identity') +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Merge the four per-class word-frequency tables into one table keyed by
# word (Var1). full_join keeps words that are missing from some classes;
# the auto-suffixed Freq columns (.x/.y/.x.x/.y.y) are renamed per class.
frequencys <- full_join(freq_figurative,freq_irony, by = "Var1") %>%
full_join(freq_regular, by = "Var1") %>%
full_join(freq_sarcasm, by = "Var1") %>%
rename(figurative = Freq.x,
irony = Freq.y,
regular = Freq.x.x,
sarcasm = Freq.y.y)
# Two-class version: "regular" vs everything else (irony/figurative/sarcasm).
frequencys_2_class <- full_join(freq_regular,freq_not_regular, by = "Var1") %>%
rename(regular = Freq.x,
not_regular = Freq.y)
# Replace zero and missing counts with 1 so the ratio columns computed
# below never divide by zero.
frequencys_2_class[frequencys_2_class == 0] <- 1
frequencys_2_class[is.na(frequencys_2_class)] <- 1
frequencys[frequencys == 0] <- 1
frequencys[is.na(frequencys)] <- 1
# How characteristic a word is of one class: its count divided by the
# product of the other three classes' counts.
frequencys <- frequencys %>%
mutate(figurative_prop = figurative/(irony * regular * sarcasm)) %>%
mutate(irony_prop = irony/(figurative * regular * sarcasm)) %>%
mutate(sarcasm_prop = sarcasm/(figurative * regular * irony)) %>%
mutate(regular_prop = regular/(figurative * sarcasm * irony))
# Regular-vs-not frequency ratio and its inverse.
frequencys_2_class <- frequencys_2_class %>%
mutate(prop = regular/not_regular) %>%
mutate(inv_prop = not_regular/regular)
# Number of top-ranked words to chart.
max = 60
#graph_freq <- function(df, max_entries = 60) {
# Top 60 words most characteristic of the regular class (4-class ratios).
frequencys %>%
arrange(desc(regular_prop)) %>%
slice(1:max) %>%
ggplot(aes(y = regular_prop, x = reorder(Var1, order(regular_prop, decreasing = TRUE)))) +
geom_bar(stat='identity') +
ggtitle("regular Outliers") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
#}
# Top 60 words ranked by the two-class ratio.
# NOTE(review): `prop` = regular/not_regular, which ranks regular-typical
# words, yet the title says "Not regular" — confirm which was intended.
frequencys_2_class %>%
arrange(desc(prop)) %>%
slice(1:max) %>%
ggplot(aes(y = prop, x = reorder(Var1, order(prop, decreasing = TRUE)))) +
geom_bar(stat='identity') +
ggtitle("Not regular Outliers") +
theme(axis.text.x = element_text(angle = 60, hjust = 1))
# Word clouds of the top 60 words for each direction of the two-class
# ratio, using ggwordcloud's geom_text_wordcloud_area.
max = 60
frequencys_tmp_prop <- frequencys_2_class %>%
arrange(desc(prop)) %>%
slice(1:max)
frequencys_tmp_inv_prop <- frequencys_2_class %>%
arrange(desc(inv_prop)) %>%
slice(1:max)
# Word size scales with how regular-typical the word is.
ggplot(frequencys_tmp_prop, aes(label = Var1, size = prop)) +
geom_text_wordcloud_area(eccentricity = .54, color = 'red') +
#scale_size_area(max_size = 30) +
theme_minimal() +
ggtitle ('Words found in tweets that are considered regular')
# Word size scales with how non-regular-typical the word is.
ggplot(frequencys_tmp_inv_prop, aes(label = Var1, size = inv_prop)) +
geom_text_wordcloud_area(eccentricity = .54, color = 'blue') +
#scale_size_area(max_size = 30) +
theme_minimal() +
ggtitle ('Words found in tweets that are not considered regular (irony, figurative, sarcasm)')
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=12' are unlikely values in pixels
## Warning in png(filename = tmp_file, width = gw_pix, height = gh_pix, res =
## dev_dpi, : 'width=12, height=16' are unlikely values in pixels
# Cleans a data frame of tweets: strips @mentions, lowercases, and removes
# punctuation and English stopwords (via the tm package).
# data: data frame with a character column `tweets`.
# Returns the same data frame with `tweets` cleaned.
preprocessing <- function(data) {
  require('tm')
  # Remove every @mention individually. The original call —
  # sub(pattern = "@.* | @.*$") — was greedy, deleting everything from the
  # first "@" to the last space of the tweet, and sub() removed only one
  # match; gsub with a non-whitespace token fixes both.
  data$tweets <- gsub("\\s?@\\S+", "", data$tweets)
  data$tweets <- tolower(data$tweets)
  data$tweets <- removePunctuation(data$tweets)
  data$tweets <- removeWords(data$tweets, words = stopwords('en'))
  #data$tweets <- data$tweets[data$tweets != ""]
  data
}
# Load and preprocess the ISarcasm training set.
# NOTE(review): Dir_ISarcasm is assigned in a later chunk — this relies on
# the chunks having been run in a different order; verify.
Dir = Dir_ISarcasm
train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')
train <- preprocessing(train)
# NOTE(review): `class` still holds string labels at this point, so
# as.double() coerces them to NA (see warning below) — presumably the
# factor recoding should run first.
training_labels <- (train$class %>% as.array() %>% as.double())
## Warning in train$class %>% as.array() %>% as.double(): NAs introduced by
## coercion
# Dataset directory paths used by the retrieval helpers below.
Dir_Main <- 'Datasets/Tweets_with_Sarcasm_and_Irony'
Dir_ISarcasm <- 'Datasets/ISarcasm'
# List the devices TensorFlow can see — confirms the GPU is available for
# training.
tensorflow::tf$python$client$device_lib$list_local_devices() %>% print()
## [[1]]
## name: "/device:CPU:0"
## device_type: "CPU"
## memory_limit: 268435456
## locality {
## }
## incarnation: 7655193342591569220
## xla_global_id: -1
##
##
## [[2]]
## name: "/device:GPU:0"
## device_type: "GPU"
## memory_limit: 5719982080
## locality {
## bus_id: 1
## links {
## }
## }
## incarnation: 2759015377138024622
## physical_device_desc: "device: 0, name: NVIDIA GeForce RTX 3070, pci bus id: 0000:01:00.0, compute capability: 8.6"
## xla_global_id: 416903419
#-------------------------------------------------------------------
# Balances a binary-labelled data set: keeps equally many class-0
# ("regular") and class-1 ("sarcasm") rows, then shuffles the rows.
# data: data frame with a `class` column coded 0/1 (numeric or character).
# Returns the balanced, row-shuffled data frame.
even_out_observations <- function(data){
  # which() drops NA comparisons, matching dplyr::filter's behavior.
  regular <- data[which(data$class == 0), , drop = FALSE]
  sarcasm <- data[which(data$class == 1), , drop = FALSE]
  # Truncate to the smaller class. The original kept all regular rows and
  # took sarcasm[1:num_regular, ], which produced NA rows whenever the
  # sarcasm class was the smaller one.
  n <- min(nrow(regular), nrow(sarcasm))
  regular <- regular[seq_len(n), , drop = FALSE]
  sarcasm <- sarcasm[seq_len(n), , drop = FALSE]
  data <- rbind(regular, sarcasm)
  # Shuffle so the two classes are interleaved.
  data[sample(seq_len(nrow(data))), ]
}
#-------------------------------------------------------------------
# Loads the ISarcasm train/test CSVs, cleans the tweet text, recodes the
# class labels to integer codes, and splits train into train/validation
# (80/20 via caret::createDataPartition).
#   Dir    - dataset directory containing train.csv and test.csv
#   binary - if TRUE, collapse figurative/irony into the sarcasm class (1)
# Returns a list with the three sets and their numeric label vectors.
retrieve_dataset_ISarcasm <- function(Dir = Dir_ISarcasm, binary = FALSE) {
  train <- read.csv(paste0(Dir,"/train.csv"), fileEncoding = 'utf-8')# %>% rename(tweets = tweet, class = sarcastic)
  test <- read.csv(paste0(Dir,"/test.csv"),fileEncoding = 'utf-8') #%>% rename(tweets = tweet, class = sarcastic)
  # Strip @mentions, lowercase, drop punctuation and English stopwords.
  preprocessing <- function(data) {
    require('tm')
    # Match each whitespace-delimited mention. The original greedy
    # pattern "@.* | @.*$" (with sub) deleted everything between the
    # first "@" and the last space, and only once per tweet.
    data$tweets <- gsub("\\s?@\\S+", "", data$tweets)
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    #data$tweets <- data$tweets[data$tweets != ""]
    data
  }
  train <- preprocessing(train)
  test <- preprocessing(test)
  # Recode string labels to integer codes (a no-op for labels already 0/1).
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  # BUG FIX: take the validation rows from the ORIGINAL train before
  # overwriting it. The original subset train first and then applied the
  # stale -index to the 80% split, so validation overlapped with train.
  validation <- train[-index,]
  train <- train[index,]
  training_labels <- (train$class %>% as.array() %>% as.double()) #normalize
  validation_labels <- (validation$class %>% as.array() %>% as.double()) #normalize
  test_labels <- (test$class %>% as.array() %>% as.double())
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#-------------------------------------------------------------------
#load('Datasets/Tweets_with_Sarcasm_and_Irony/test_w_hashtags.RData')
# Loads the "Tweets with Sarcasm and Irony" train/test CSVs, cleans the
# text, recodes labels to integer codes, and splits train into
# train/validation (80/20 via caret::createDataPartition).
#   binary           - collapse figurative/irony into sarcasm (1) if TRUE
#   even_out         - balance class counts (binary mode only)
#   without_hashtags - read the hashtag-stripped CSV variants instead
# Returns a list with the three sets and their numeric label vectors.
retrieve_dataset <- function(Dir = 'Datasets/Tweets_with_Sarcasm_and_Irony', binary = FALSE, even_out = FALSE, without_hashtags = FALSE) {
  if(!without_hashtags){
    train <- read_csv(paste0(Dir,"/train.csv"))
    test <- read_csv(paste0(Dir,"/test.csv"))
  } else {
    train <- read_csv(paste0(Dir,"/train_without_hashtags.csv"))
    test <- read_csv(paste0(Dir,"/test_without_hashtags.csv"))
  }
  # Lowercase, drop punctuation and English stopwords (tm package).
  preprocessing <- function(data) {
    require('tm')
    data$tweets <- tolower(data$tweets)
    data$tweets <- removePunctuation(data$tweets)
    data$tweets <- removeWords(data$tweets, words = stopwords('en'))
    data
  }
  train <- preprocessing(train)
  test <- preprocessing(test)
  # Drop test rows with a missing label.
  test <- test %>% filter(!is.na(class))
  # Recode string labels to integer codes.
  factor_set <- function(set) {
    set$class[set$class == 'regular'] = 0
    set$class[set$class == 'sarcasm'] = 1
    if(!binary) {
      set$class[set$class == 'figurative'] = 2
      set$class[set$class == 'irony'] = 3
    } else {
      set$class[set$class == 'figurative'] = 1
      set$class[set$class == 'irony'] = 1
    }
    set
  }
  train <- factor_set(train)
  test <- factor_set(test)
  index <- createDataPartition(train$class, p = .8, list = FALSE)
  # BUG FIX: take the validation rows from the ORIGINAL train before
  # overwriting it. The original subset train first and then applied the
  # stale -index to the 80% split, so validation overlapped with train.
  validation <- train[-index,]
  train <- train[index,]
  if(even_out && binary){
    train <- even_out_observations(train)
    test <- even_out_observations(test)
    validation <- even_out_observations(validation)
  }
  training_labels <- (train$class %>% as.array() %>% as.double()) #normalize
  validation_labels <- (validation$class %>% as.array() %>% as.double()) #normalize
  test_labels <- (test$class %>% as.array() %>% as.double())
  list(train_set = train,
       train_labels = training_labels,
       test_set = test,
       test_labels = test_labels,
       validation_set = validation,
       validation_labels = validation_labels)
}
#--------------------------------------------------------------------------------
# Converts train/validation/test tweet text into padded integer sequences
# for the embedding layer. A tokenizer is fitted on the training text
# unless one is supplied, so the same vocabulary can be reused.
#   maxlen    - length every sequence is padded/truncated to
#   max_words - vocabulary size the tokenizer keeps
# Returns the three sequence matrices plus the tokenizer used.
generate_sequences <- function(train_data,
                               validation_data,
                               testing_data,
                               maxlen = 50,
                               max_words = 2000,
                               tokenizer = NULL)
{
  if (is.null(tokenizer)) {
    # Fit a fresh tokenizer on the training text only.
    tokenizer <- text_tokenizer(num_words = max_words) %>%
      fit_text_tokenizer(train_data$tweets %>% as.array())
    print('creating Tokenizer.....')
  } else {
    print('found tokenizer!')
  }
  # Map a text column to integer sequences (tokenizer$word_index gives the
  # int -> word mapping), then pad/truncate each to maxlen.
  to_padded <- function(text_column) {
    seqs <- texts_to_sequences(tokenizer, text_column %>% as.array())
    pad_sequences(seqs, maxlen = maxlen)
  }
  list(train = to_padded(train_data$tweets),
       validation = to_padded(validation_data$tweets),
       test = to_padded(testing_data$tweets),
       tokenizer = tokenizer
  )
}
#-------------------------------------------------------------------------------------------------------------------
# Builds a confusion-matrix summary for binary classification results.
#   Labels  - true labels, 0/1 (vector or list)
#   Guesses - predicted labels, 0/1 (vector or list)
# Returns a data.frame with `group` (TN/FP/FN/TP description) and `value`
# (the count of each outcome), in the order TN, FP, FN, TP.
Accuracy_Label_Table <- function (Labels, Guesses) {
  lab <- as.integer(unlist(Labels))
  gue <- as.integer(unlist(Guesses))
  # Index 2*Label + Guess + 1 selects the outcome directly. The original
  # built the same index per element by formatting "Label Guess" as a
  # string and re-parsing it with strtoi(base = 2) inside purrr::map2.
  codes <- c("TN", #Label = 0, Guess = 0
             "FP", #Label = 0, Guess = 1
             "FN", #Label = 1, Guess = 0
             "TP") #Label = 1, Guess = 1
  outcome <- codes[2L * lab + gue + 1L]
  group = c("True Negative (TN)", #Label = 0, Guess = 0
            "False Positive (FP)", #Label = 0, Guess = 1
            "False Negative (FN)", #Label = 1, Guess = 0
            "True Positive (TP)" #Label = 1, Guess = 1
  )
  value = c(sum(outcome == "TN"),
            sum(outcome == "FP"),
            sum(outcome == "FN"),
            sum(outcome == "TP"))
  data.frame(group = group,
             value = value)
}
#-------------------------------------------------------------------------------
# Draws a pie chart of the confusion-matrix counts (TN/FP/FN/TP) for binary
# labels/guesses, annotated with positive, negative, and total accuracy.
# Uses Accuracy_Label_Table() for the counts, ggplot2 for the chart,
# cowplot::ggdraw and grid::textGrob for the text overlay.
FP_Pie_Chart <- function(Labels, Guesses) {
a_table <- Accuracy_Label_Table(Labels = Labels,
Guesses = Guesses)
# Row order of a_table: 1 = TN, 2 = FP, 3 = FN, 4 = TP (column 2 = count).
# N_Acc = TN/(TN+FN), P_Acc = TP/(TP+FP).
# NOTE(review): these are predictive values (precision-like), not recall —
# confirm the intended metric.
N_Acc <- round(a_table[1,2] / (a_table[1,2] + a_table[3,2]), digits = 4)
P_Acc <- round(a_table[4,2] / (a_table[4,2] + a_table[2,2]), digits = 4)
Acc <- round((a_table[1,2] + a_table[4,2]) / (a_table[1,2] + a_table[3,2] + a_table[4,2] + a_table[2,2]), digits = 4)
# A stacked bar in polar coordinates is a pie chart; labels show counts.
plt <- a_table %>%
ggplot(aes(x = "", y = value, fill = group)) +
geom_col() +
geom_label(aes(label = value),
position = position_stack(vjust = 0.5),
show.legend = FALSE) +
coord_polar(theta = "y") +
scale_fill_manual(values = c("#FFABAB", "#FFB092",
"#b4d4fa", "#BFFCC6"),
guide = guide_legend(reverse = TRUE)) +
ggtitle("TP, TN, FP, FN Pie Chart") +
theme_void()
# Overlay the three accuracy figures near the top-right of the drawing.
plt <- ggdraw(plt)
plt <- plt +
annotation_custom(grob = textGrob(paste0("Accuracy Positive: ",P_Acc)), xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025, ymax = 1- .025) +
annotation_custom(grob = textGrob(paste0("Accuracy Negative: ",N_Acc)), xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .05, ymax = 1- .025 - .05) +
annotation_custom(grob = textGrob(paste0("Total Accuracy: ",Acc)), xmin = 1 - .2, xmax = 1 - .2, ymin = 1 - .025 - .1, ymax = 1- .025 - .1)
plt
}
#-------------------------------------------------------------------------------------------
# One-hot (binary presence/absence) encodes train/validation/test text.
#   max_words - vocabulary size when a new tokenizer is fitted
#   tokenizer - optional pre-fitted keras tokenizer (default NA = fit one
#               on the training text)
# Returns the three 0/1 matrices plus the tokenizer. The original
# (misspelled) `valdiation` key is kept for existing callers; a correctly
# spelled `validation` alias is appended.
one_hot_encode <- function(train,validation,test, max_words, tokenizer = NA) {
  training_text <- train %>% as.array()
  validation_text <- validation %>% as.array()
  testing_text <- test %>% as.array()
  # BUG FIX: the original condition was inverted (`!is.na(tokenizer)`), so
  # a supplied tokenizer was overwritten and a missing one was never
  # created (texts_to_matrix then received NA).
  if (is.null(tokenizer) || identical(tokenizer, NA)) {
    tokenizer <- text_tokenizer(num_words = max_words) %>%
      fit_text_tokenizer(training_text)
  }
  # Matrix of 0/1: 1 where the vocabulary word is present in the text.
  train_one_hot_matrix <- texts_to_matrix(tokenizer, training_text, mode = "binary")
  #word_index <- tokenizer$word_index #The dictionary to translate a sequence to a sentence
  validation_one_hot_matrix <- texts_to_matrix(tokenizer, validation_text, mode = "binary")
  test_one_hot_matrix <- texts_to_matrix(tokenizer, testing_text, mode = "binary")
  list(train = train_one_hot_matrix,
       valdiation = validation_one_hot_matrix,
       test = test_one_hot_matrix,
       tokenizer = tokenizer,
       validation = validation_one_hot_matrix)
}
# Hyperparameters for the embedding/RNN model.
max_words = 1000
embedding_dim = 8
maxlen = 50
# Binary ISarcasm split: train / validation / test with numeric labels.
sets <- retrieve_dataset_ISarcasm(binary = TRUE
)
train <- sets$train_set
training_labels <- sets$train_labels
validation <- sets$validation_set
validation_labels <- sets$validation_labels
test <- sets$test_set
test_labels <- sets$test_labels
# Tokenize and pad all three text sets to fixed-length integer sequences.
sequences <- generate_sequences(train,
validation,
test,
maxlen = maxlen,
max_words = max_words)
training_sequences <- sequences$train
validation_sequences <- sequences$validation
test_sequences <- sequences$test
# Model: embedding -> bidirectional LSTM -> LSTM -> sigmoid binary output.
model <- keras_model_sequential() %>%
layer_embedding(input_dim = max_words,
output_dim = embedding_dim,
input_length = maxlen) %>%
bidirectional(layer_lstm(units = 128, return_sequences = TRUE))%>%
layer_lstm(units = 64, return_sequences = FALSE) %>%
# NOTE(review): with return_sequences = FALSE the LSTM already outputs a
# 2-D tensor, so this flatten appears to be a no-op — confirm.
layer_flatten() %>%
layer_dense(units = 1,
activation = "sigmoid")
model %>% compile(
optimizer = "rmsprop",
loss = "binary_crossentropy",
metrics = "accuracy"
)
# Train for 17 epochs, monitoring the held-out validation split.
history <- model %>% fit(
training_sequences,
training_labels,
epochs = 17,
batch_size = 128,
validation_data= list(validation_sequences,validation_labels)
)
# Final evaluation on the test set (loss and accuracy).
results <- model %>% evaluate(test_sequences,test_labels)
results